Usenet 1993 July

home *** CD-ROM | disk | FTP | other *** search

/ Usenet 1993 July / InfoMagic USENET CD-ROM July 1993.ISO / sources / unix / volume8 / sp / part02 < prev next >

Wrap

Internet Message Format | 1987-02-19 | 17.8 KB

Subject: v08i077: Soundex spelling checker, Part02/02 Newsgroups: mod.sources Approved: mirror!rs Submitted by: Barry Brachman <brachman@cs.ubc.cdn> Mod.sources: Volume 8, Issue 77 Archive-name: sp/Part02 #! /bin/sh # This is a shell archive. Remove anything before this line, # then unpack it by saving it in a file and typing "sh file". # If all goes well, you will see the message "End of archive 2 (of 2)." # Contents: sp.9 sp.c sp.h sp.ml PATH=/bin:/usr/bin:/usr/ucb; export PATH echo shar: extracting "'sp.9'" '(1112 characters)' if test -f 'sp.9' ; then echo shar: will not over-write existing file "'sp.9'" else sed 's/^X//' >sp.9 <<'@//E*O*F sp.9//' X.TH SP 9-EMACS "8-Aug-1985" X.UC X.SH NAME Xsp, sp-from-buffer \- give possible spellings X.br Xget-next-word \- return word at cursor X.SH SYNOPSIS X.B (sp) X.br X.B (sp-from-buffer) X.br X.B (get-next-word) X.SH DESCRIPTION X.I Sp Xprompts for a word and then invokes the X.B sp Xprogram with the word as an argument. XThe output of the X.B sp Xprogram, a list of words, is placed in a buffer called "sp". XIf the word is found, then the cursor is positioned at the word. XIf the word isn't found, the cursor is left at the start of the buffer. XIn either case the mode line of the buffer indicates whether the word Xwas found. X.sp 2 X.B Sp-from-buffer Xis the same as X.B sp Xexcept that the user is not prompted for the word; the word under or Ximmediately to the left of the cursor is used. X.sp 2 X.B Get-next-word Xreturns the word the cursor is pointing at or the word immediately Xto the left of the cursor if it is between words. X.SH BUFFERS USED Xsp \- result X.SH FILES X/usr/local/lib/emacs/sp.ml X.br X/usr/local/sp X.SH SEE ALSO X.B sp(1-LOCAL) X.SH AUTHOR XBarry Brachman X.br XDept. of Computer Science X.br XUniversity of British Columbia @//E*O*F sp.9// if test 1112 -ne "`wc -c <'sp.9'`"; then echo shar: error transmitting "'sp.9'" '(should have been 1112 characters)' fi fi # end of overwriting check echo shar: extracting "'sp.c'" '(9128 characters)' if test -f 'sp.c' ; then echo shar: will not over-write existing file "'sp.c'" else sed 's/^X//' >sp.c <<'@//E*O*F sp.c//' X/* vi: set tabstop=4 : */ X X/* X * Version 1.3 December 1986 X * X * sp - spell word X * X * Usage: sp [-f dictionary-list] [-eavc] [word ...] X * X * Compute the Soundex code for each word on the command line X * (or each word on the standard input) and compare against a X * dictionary X * X * The soundex dictionary list may be specified on the command line X * The environment variable SPPATH may be set to a list of colon X * separated pathnames of soundex dictionaries. X * If a command line dictionary-list (a colon separated list of pathnames) is X * given in addition to the SPPATH variable, all dictionaries are used. X * X * To reduce the size of the word list, certain heuristics are used: X * the -a option causes all words matched to be printed X * The output is alphabetically sorted and indicators are printed X * beside each word: X * X == exact match X * ! == close match X * * == near match X * ' ' == matched X * X * Note that the maximum number of colliding words is MAXCOUNT due to the X * data structure used. X * X * Permission is given to copy or distribute this program provided you X * do not remove this header or make money off of the program. X * X * Please send comments and suggestions to: X * X * Barry Brachman X * Dept. of Computer Science X * Univ. of British Columbia X * Vancouver, B.C. V6T 1W5 X * X * .. {ihnp4!alberta, uw-beaver}!ubc-vision!ubc-cs!brachman X * brachman@cs.ubc.cdn X * brachman%ubc.csnet@csnet-relay.arpa X * brachman@ubc.csnet X */ X X#include <sys/types.h> X#include <sys/file.h> X#include <ctype.h> X#include <stdio.h> X X#ifdef NEWDBM X#include <ndbm.h> X#else !NEWDBM X#include <dbm.h> X#endif NEWDBM X X#include "sp.h" X X#define streq(X, Y) (!strcmp(X, Y)) X#define range(S) ((strlen(S) + 4) / 5) X X#define USAGE "Usage: sp [-f dictionary-list] [-eavc] [word ...]" X Xchar word[MAXWORDLEN + 2]; X Xdatum FETCH(); X Xchar *fileptr[MAXDICT + 1]; /* Up to MAXDICT dictionaries + sentinel */ Xint dict_ptr = 0; X Xchar *wordptr[MAXWORDS], *wordlistptr; Xchar wordlist[WORDSPACE]; Xint nmatched; X X/* X * Soundex codes X * The program depends upon the numbers zero through six being used X * but this can easily be changed X */ Xchar soundex_code_map[26] = { X/*** A B C D E F G H I J K L M N O P ***/ X 0, 1, 2, 3, 0, 1, 2, 0, 0, 2, 2, 4, 5, 5, 0, 1, X X/*** Q R S T U V W X Y Z ***/ X 2, 6, 2, 3, 0, 1, 0, 2, 0, 2 X}; X Xint aflag, cflag, eflag, vflag; X Xmain(argc, argv) Xint argc; Xchar **argv; X{ X register int fflag, i; X register char *p; X char *getenv(); X X argc--; argv++; X fileptr[0] = (char *) NULL; X while (argc > 0 && argv[0][0] == '-') { X fflag = 0; /* to break out of following loop... */ X for (i = 1; argv[0][i] != '\0' && fflag == 0; i++) { X switch (argv[0][i]) { X case 'a': X aflag = 1; X break; X case 'c': X cflag = 1; X break; X case 'e': X eflag = 1; X break; X case 'f': X if (argc == 1) { X fprintf(stderr, "%s\n", USAGE); X exit(1); X } X mkfilelist(argv[1]); X argc--; X argv++; X fflag = 1; /* break out of loop */ X break; X case 'v': X vflag = 1; X break; X default: X fprintf(stderr, "%s\n", USAGE); X exit(1); X } X } X argc--, argv++; X } X X if ((p = getenv("SPPATH")) != (char *) NULL) X mkfilelist(p); X if (fileptr[0] == (char *) NULL) X mkfilelist(DEFAULT_SPPATH); X if (vflag) { X printf("Using dictionaries:\n"); X for (i = 0; fileptr[i] != (char *) NULL; i++) X if (strlen(fileptr[i]) > 0) X printf("\t%s\n", fileptr[i]); X } X if (argc) { X for (i = 0; i < argc; i++) { X if (!eflag) X printf("%s:\n", argv[i]); X apply(argv[i]); X if (!eflag) X printf("\n"); X } X } X else { X int ch, len; X X while (1) { X printf("Word? "); X if (fgets(word, sizeof(word), stdin) == (char *) NULL) { X printf("\n"); X break; X } X len = strlen(word); X if (word[len - 1] != '\n') { X fprintf(stderr, "sp: Word too long: %s", word); X while ((ch = getchar()) != '\n') /* flush rest of line */ X putc(ch, stderr); X putc('\n', stderr); X continue; X } X word[--len] = '\0'; X if (len > MAXWORDLEN) { X fprintf(stderr, "sp: Word too long: %s\n", word); X continue; X } X X apply(word); X if (!eflag) X printf("\n"); X } X } X} X X/* X * Apply the Soundex search for a word to each dictionary in turn X * Note that 'DBMINIT' opens both the '.dir' and the '.pag' files X * and we must close them to avoid running out of file descriptors X * X * This routine gets called each time a word is looked up and therefore X * the dbm files may be repeatedly opened and closed. Since the vast majority X * of the time this program is invoked for just a single word it doesn't seem X * worthwhile to do the right thing by saving file descriptors/DBM pointers. X * There probably won't be more than two dictionaries in use anyway. X */ Xapply(word) Xchar *word; X{ X register int code, i, nodicts; X X nmatched = 0; X wordlistptr = wordlist; X if ((code = soundex(word, 3)) == BAD_WORD) X return; X nodicts = 1; X for (i = 0; fileptr[i] != (char *) NULL; i++) { X if (strlen(fileptr[i]) == 0) X continue; X if (DBMINIT(fileptr[i], O_RDONLY) != -1) { X proc(code); X nodicts = 0; X } X DBMCLOSE(); X } X if (nodicts) { X fprintf(stderr, "sp: Can't open any dictionaries\n"); X exit(1); X } X if (vflag && !eflag && nmatched == 0) X printf("%s: no match\n", word); X else X choose(word); X} X X/* X * Look the word up in the current dictionary X * and save all the matches X * Note that only three digits are of the Soundex code are stored X * in a dictionary X */ Xproc(soundex) Xint soundex; X{ X register int c, len; X datum dbm_key, dbm_content; X key_t *key, keyvec[KEYSIZE]; X char *mk_word(), *p; X X key = keyvec; X dbm_key.dptr = (char *) key; X dbm_key.dsize = KEYSIZE; X c = 0; X while (1) { X mk_key(key, soundex, c); X dbm_content = FETCH(dbm_key); X X if (dbm_content.dptr == 0) X break; X X if (IS_DELETED(dbm_content)) { X if (++c > MAXCOUNT) { X fprintf(stderr, "sp: entry count overflow\n"); X exit(1); X } X continue; X } X X if (nmatched == MAXWORDS) { X fprintf(stderr, "sp: Too many matches\n"); X exit(1); X } X X p = mk_word(dbm_content.dptr, dbm_content.dsize, soundex); X len = strlen(p); X if (wordlistptr + len >= &wordlist[WORDSPACE]) { X fprintf(stderr, "sp: Out of space for words\n"); X exit(1); X } X strncpy(wordlistptr, p, len); X wordlistptr[len] = '\0'; X wordptr[nmatched++] = wordlistptr; X wordlistptr += len + 1; X if (++c > MAXCOUNT) { X fprintf(stderr, "sp: entry count overflow\n"); X exit(1); X } X } X} X X/* X * Select and print those words which we consider X * to have matched 'word' X */ Xchoose(word) Xregister char *word; X{ X register int c, code, i, len, mcount, wordlen; X register char *p; X int compar(); X X code = soundex(word, 4); X qsort(wordptr, nmatched, sizeof(char *), compar); X c = range(word); X wordlen = strlen(word); X mcount = 0; X for (i = 0; i < nmatched; i++) { X p = wordptr[i]; X if (strmatch(word, p) == 0) { X printf("X"); X if (eflag) { X printf(" %s\n", word); X return; X } X } X else if (eflag) X continue; X else if (soundex(p, 4) == code) X printf("!"); X else if (aflag && X (wordlen < (len = strlen(p)) - c || len > wordlen + c)) X printf(" "); X else if (!cflag) X printf("*"); X else X continue; X printf("%3d. %s\n", mcount + 1, p); X mcount++; X } X if (vflag) X printf("(%d total matches)\n", nmatched); X} X X/* X * Compute an 'n' digit Soundex code for 'word' X * See mksp.c X */ Xsoundex(word, n) Xregister char *word; Xint n; X{ X register int c, digit_part, previous_code, soundex_length; X register char *p, *w; X char wcopy[MAXWORDLEN + 2]; X X strcpy(wcopy, word); X p = w = wcopy; X while (*p != '\0') { X if (isupper(*p)) X *p = tolower(*p); X p++; X } X if (!isalpha(*w)) { X fprintf(stderr, "sp: Improper word: %s\n", word); X return(BAD_WORD); X } X digit_part = 0; X soundex_length = 0; X previous_code = soundex_code_map[*w - 'a']; X for (p = w + 1; *p != '\0' && soundex_length < n; p++) { X if (!isalpha(*p)) X continue; X c = soundex_code_map[*p - 'a']; X if (c == 0 || previous_code == c) { X previous_code = c; X continue; X } X digit_part = digit_part * 7 + c; X previous_code = c; X soundex_length++; X } X while (soundex_length++ < n) X digit_part *= 7; X return((digit_part << 5) + *w - 'a'); X} X X/* X * Process a path string (environment variable SPPATH, DEFAULT_SPPATH, or an X * arg) by separating the pathnames into strings pointed to by elements X * of 'fileptr' X * End of list indicated by fileptr entry of NULL X * X * No attempt made to ignore duplicate pathnames X */ Xmkfilelist(p) Xregister char *p; X{ X register int len; X register char *path, *start; X char *malloc(); X X while (*p != '\0' && dict_ptr < MAXDICT) { X start = p; X while (*p != ':' && *p != '\0') X p++; X if (start == p && *p == ':') { /* colon with nothing else */ X p++; X continue; X } X len = p - start; X path = (char *) malloc((unsigned) (len + 1)); X if (path == (char *) NULL) { X fprintf(stderr, "sp: Out of dictionary space\n"); X exit(1); X } X strncpy(path, start, len); X path[len] = '\0'; X fileptr[dict_ptr++] = path; X } X fileptr[dict_ptr] = (char *) NULL; X} X Xcompar(p, q) Xchar **p, **q; X{ X X return(strmatch(*p, *q)); X/* return(strcmp(*p, *q)); */ /* use if you prefer case sensitive */ X} X @//E*O*F sp.c// if test 9128 -ne "`wc -c <'sp.c'`"; then echo shar: error transmitting "'sp.c'" '(should have been 9128 characters)' fi fi # end of overwriting check echo shar: extracting "'sp.h'" '(3553 characters)' if test -f 'sp.h' ; then echo shar: will not over-write existing file "'sp.h'" else sed 's/^X//' >sp.h <<'@//E*O*F sp.h//' X/* sp.h */ X/* vi: set tabstop=4 : */ X X/* X * A deleted dbm entry is denoted by a dsize of zero X */ X#define IS_DELETED(C) (C.dsize == 0) X X/* X * Because the soundex code (part of the key) includes the first character of X * the word, we don't need to store the first character again with the content. X * To do this we treat the first byte of the content stored in the dbm X * specially: we rip off the two high order bits of the first byte of X * the content and therefore have to restrict the value of the second X * character of the word. We use 'a' == 0, 'z' == 25, 'A' == 26, 'Z' == 51. X * See spchar_map[] (misc.c) for the mapping of codes 52 through 63. X * This behaviour is isolated in tospchar() and fromspchar(). X * If spchar_map is changed you should change the man page too. X * X * The word can be reconstructed by extracting the first character of the word X * from the soundex code and then looking at the first byte of the content. X * If the UPPER_CHAR bit is on in the first byte of the content then the first X * character of the word should be upper case. X * The length of the content reflects the actual number of bytes stored in the X * dbm. Words that have been deleted from the dbm are stored with a length of X * zero. Because of this, words of length 1 are treated differently: they are X * stored with a length of 1 and with the SINGLE_CHAR bit set. Words with X * original length > 1 will have (length - 1) bytes stored in the content. X * Clear? X */ X#define IS_VALID(w) (isalpha(*w) && (*(w+1) == '\0' || isalpha(*(w+1)) \ X || tospchar(*(w+1)) != '\0')) X#define UPPER_CHAR 0200 /* 1st char of word is upper case */ X#define SINGLE_CHAR 0100 /* single char word */ X#define MASK_CHAR 0077 /* mask out the indicator bits */ X#define QUOTE_CHAR 0064 /* (52) code for single quote */ X#define AMPER_CHAR 0065 /* (53) for ampersand */ X#define PERIOD_CHAR 0066 /* (54) for period */ X#define SPACE_CHAR 0067 /* (55) for blank */ X X/* X * Map for first byte of dbm content (special characters) X * Terminated by a null entry X */ Xstruct spchar_map { X char spchar; X char code; X}; X X#define MAXDICT 10 /* Max number of dictionaries to use */ X#define MAXWORDLEN 50 /* Max word length */ X#define MAXWORDS 400 /* Max number of words in one sp query */ X#define WORDSPACE 20480 /* Max space used words for one sp query */ X X/* X * This is the default path used by sp to find dictionaries X * Adjust for local conditions X */ X#define DEFAULT_SPPATH "/usr/local/lib/sp.dict.1:/usr/local/lib/sp.dict.2" X X/* X * The following must be the maximum value containable in the count part of X * a key. X * It must be always be less than: (the maximum positive value that can be X * contained in an int) - 1 X * This value imposes a limit on the number of words in a dictionary having the X * same soundex code. For /usr/dict/words (~25K words), a count of 255 is X * sufficient. Larger dictionaries will need more. In any case you can X * always just make another dictionary and split up your words. X * You might want to adjust MAXWORDS and WORDSPACE (above) to reflect MAXCOUNT X * if you've got plenty of memory. X */ X#define MAXCOUNT 1023 /* 2^10 - 1 */ X X/* X * The key used by dbm looks like this: X * X * <10 bits> <5 bits> <9 bits> X * counter first char soundex X * X * A soundex value is treated as a base 7 number (maximum is 666, base 7). X */ X#define KEYSIZE 3 /* in bytes */ Xtypedef unsigned char key_t; X X#define BAD_WORD -1 /* This must be an illegal Soundex */ X#define NO_MATCH 0 X#define MATCHED 1 X Xextern char soundex_code_map[]; X @//E*O*F sp.h// if test 3553 -ne "`wc -c <'sp.h'`"; then echo shar: error transmitting "'sp.h'" '(should have been 3553 characters)' fi fi # end of overwriting check echo shar: extracting "'sp.ml'" '(1613 characters)' if test -f 'sp.ml' ; then echo shar: will not over-write existing file "'sp.ml'" else sed 's/^X//' >sp.ml <<'@//E*O*F sp.ml//' X; X; Written by Donald Acton and Barry Brachman with help from Marc Majka X; March 11/85 X; X; Dept. of Computer Science X; University of British Columbia X; X X(defun X (sp spword X (setq spword (get-tty-string "Word: " )) X (do-sp spword) X ) X) X X(defun ; bjb X (sp-from-buffer spword X (setq spword (get-next-word)) X (message (concat "Word: " spword)) X (sit-for 0) X (do-sp spword) X ) X) X X(defun X (do-sp curr found tmp X (setq curr (current-buffer-name)) X (pop-to-buffer "sp") X (setq needs-checkpointing 0) X (erase-buffer) X (set-mark) X (filter-region (concat "sp " spword)) X (beginning-of-file) X (setq found " *not found*") X (setq tmp case-fold-search) X (setq case-fold-search 1) X (error-occurred (re-search-forward (concat "\. " spword "$")) X (beginning-of-line) X (line-to-top-of-window) X (setq found " *found*")) X (setq case-fold-search tmp) X (setq mode-line-format " %b - %m %p") X (setq mode-string (concat spword found)) X (pop-to-buffer curr) X (novalue))) X X; X; Return the word the cursor is pointing at X; or the word immediately to the left of the X; cursor if it is between words X; X; April 15/85 - bjb X; X(defun X (get-next-word original-dot rb spword X (save-excursion X (setq original-dot (dot)) X (backward-word) (forward-word) (setq rb (dot)) X (if (> original-dot rb) X (progn (forward-word) (backward-word)) X (backward-word)) X (set-mark) X (forward-word) X (setq spword (region-to-string)) X ) X spword X ) X) X @//E*O*F sp.ml// if test 1613 -ne "`wc -c <'sp.ml'`"; then echo shar: error transmitting "'sp.ml'" '(should have been 1613 characters)' fi fi # end of overwriting check echo shar: "End of archive 2 (of 2)." cp /dev/null ark2isdone DONE=true for I in 1 2; do if test ! -f ark${I}isdone; then echo "You still need to run archive ${I}." DONE=false fi done case $DONE in true) echo "You have run both archives." echo 'See the README' ;; esac ## End of shell archive. exit 0